Predicting New Construction in Philadelphia

MUSA 508 Final Project

Author

Laura Frances and Nissim Lebovits

Published

December 3, 2023

1 Summary

2 Introduction

Show the code
# Packages required by this analysis; install_and_load_packages() (from
# utils) installs any that are missing before attaching them.
required_packages <- c(
  "tidyverse", "sf", "acs", "tidycensus", "sfdep", "kableExtra",
  "conflicted", "gganimate", "tmap", "gifski", "transformr", "ggpubr",
  "randomForest", "janitor", 'igraph'
)
install_and_load_packages(required_packages)

# Shared plotting helpers.
source("utils/viz_utils.R")

# Pin conflicted verbs to their dplyr versions.
select <- dplyr::select
filter <- dplyr::filter
lag <- dplyr::lag

# No scientific notation in output; cache tigris downloads and return sf.
options(scipen = 999, tigris_use_cache = TRUE, tigris_class = 'sf')

# PA State Plane South (ft) -- the projected CRS used throughout.
crs <- 'epsg:2272'
  1. Predict development pressure: how do we define “a lot of development”?

  2. Define affordability burden: how do we define “affordability burden”? – % change year over year in the population experiencing rent burden (we will probably see extreme tipping points), growing population, % change in area incomes

  3. Identify problem zoning

  4. Calculate number of connected parcels

  5. Predict development pressure at the block level

  6. Identify not burdened areas

  7. Identify problem zoning

  8. Calculate number of connected parcels

  9. Advocate for upzoning in parcels where there is local development pressure, no affordability burden, problem zoning, and high number of connected parcels

  • transit
  • zoning (OSM)
  • land use (OSM)
Show the code
# ---- Data sources -------------------------------------------------------
# Named vector of inputs: remote GeoJSON endpoints plus local files. Each
# element is read with phl_spat_read() and assigned into the global
# environment under its name (phl, phl_bgs, nbhoods, ...).
urls <- c(
  phl = "https://opendata.arcgis.com/datasets/405ec3da942d4e20869d4e1449a2be48_0.geojson",
  phl_bgs = "https://opendata.arcgis.com/datasets/2f982bada233478ea0100528227febce_0.geojson",
  nbhoods = "https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson",
  roads = 'https://opendata.arcgis.com/datasets/261eeb49dfd44ccb8a4b6a0af830fdc8_0.geojson',
  historic_districts = "https://phl.carto.com/api/v2/sql?q=SELECT+*+FROM+historicdistricts_local&filename=historicdistricts_local&format=geojson&skipfields=cartodb_id",
  zoning = "https://opendata.arcgis.com/datasets/0bdb0b5f13774c03abf8dc2f1aa01693_0.geojson",
  overlays = "https://opendata.arcgis.com/datasets/04fd29a8c022471994900cb0fd791bfc_0.geojson",
  opa = "data/opa_properties.geojson",
  building_permits = building_permits_path,
  acs14 = acs_vars14_path,
  acs19 = acs_vars19_path,
  trolley_stops = "data/Trolley_Stations.geojson",
  subway_stops = "data/Highspeed_Stations.geojson"
)

suppressMessages({
  invisible(
    imap(urls, ~ assign(.y, phl_spat_read(.x), envir = .GlobalEnv))
  )
})

# Keep only the block-group identifier.
phl_bgs <- phl_bgs %>%
  select(GEOID10)

# Broad and Market Streets, drawn on every map as a visual reference.
broad_and_market <- roads %>%
  filter(ST_NAME %in% c('BROAD', "MARKET") |
           SEG_ID %in% c(440370, 421347, 421338, 421337, 422413,
                         423051, 440403, 440402, 440391, 440380))

# Clip subway stops to the city boundary, then pool with trolley stops.
# NOTE(review): st_union() on two sf objects is a pairwise union (n x m
# geometries). The min-distance calculation downstream is unaffected, but
# row-binding the two stop layers would be much cheaper -- confirm intent.
subway_stops <- subway_stops[phl, ]
transit_stops <- st_union(trolley_stops, subway_stops)

# Historic districts: keep the district name under a stable column name.
historic_districts <- historic_districts %>%
  mutate(hist_dist = name) %>%
  select(hist_dist)

nbhoods <- nbhoods %>%
  select(mapname)

# Tidy overlay symbols: strip slashes and recode the literal "[NA]" label.
overlays <- overlays %>% clean_names()
overlays$overlay_symbol <- gsub("/", "", overlays$overlay_symbol)
overlays <- overlays %>%
  mutate(overlay_symbol = ifelse(overlay_symbol == "[NA]", "Other", overlay_symbol))

# Restrict permits to residential construction types.
building_permits <- building_permits %>%
  filter(permittype %in% c("RESIDENTIAL BUILDING", "BP_ADDITION", "BP_NEWCNST"))

# ACS columns carried into the regressions.
acs_reg_vars <- c(
  "GEOID",
  "Total_Pop",
  "Med_Inc",
  "Percent_Nonwhite",
  "Percent_Renters",
  "Rent_Burden",
  "Ext_Rent_Burden"
)

# all_of() makes a missing column an error instead of a silent surprise.
acs14_reg_vars <- phl_spat_read(acs_vars14_path) %>%
  st_drop_geometry() %>%
  select(all_of(acs_reg_vars)) %>%
  clean_names() %>%
  rename(GEOID10 = geoid)

acs19_reg_vars <- phl_spat_read(acs_vars19_path) %>%
  st_drop_geometry() %>%
  select(all_of(acs_reg_vars)) %>%
  clean_names() %>%
  rename(GEOID10 = geoid)

# 2014 -> 2019 changes per block group.
acs_tot <- acs14_reg_vars %>%
  left_join(acs19_reg_vars, by = "GEOID10", suffix = c("_14", "_19")) %>%
  mutate(
    # FIX: parenthesize the numerator. `a - b / b` parses as `a - (b / b)`,
    # i.e. `a - 1`, which is not a percent change.
    pct_med_inc_change = (med_inc_19 - med_inc_14) / med_inc_14,
    pct_tot_pop_change = (total_pop_19 - total_pop_14) / total_pop_14,
    pct_renters_change = percent_renters_19 - percent_renters_14,
    pct_rent_burdenchange = rent_burden_19 - rent_burden_14
  )
Show the code
# Quick sanity check: plot the block-group geometries with no theming.
phl_bgs %>%
  ggplot() +
  geom_sf() +
  theme_void()

Show the code
# Re-read the block groups fresh (phl_bgs is mutated extensively below) and
# keep only the GEOID.
phl_bgs <- phl_spat_read("https://opendata.arcgis.com/datasets/2f982bada233478ea0100528227febce_0.geojson") %>%
              select(GEOID10)

# Create a complete grid of GEOID10 and year so that block-group/year
# combinations with zero permits still appear as rows.
geoid_years <- expand.grid(GEOID10 = unique(phl_bgs$GEOID10),
                           year = unique(building_permits$year))



# Count permits per block group per year, fill absent combinations with 0 via
# the complete grid, then re-attach the block-group geometry.
permits_bg <- st_join(phl_bgs, building_permits) %>%
              group_by(GEOID10, year) %>%
              summarize(permits_count = sum(permits_count, na.rm = TRUE)) %>%
              st_drop_geometry() %>%
              right_join(geoid_years, by = c("GEOID10", "year")) %>%
              replace_na(list(permits_count = 0)) %>%
              left_join(phl_bgs, by = "GEOID10") %>%
              st_as_sf() 


### spat + temp lags----------------------------------
# Per year: spatial lag of permit counts over the 5 nearest neighbours.
# Then, after ungrouping, temporal lags of 1-9 years plus a countdown-to-2022
# feature (2022 is the last tax-abatement year; see Section 4.1).
# NOTE(review): the temporal lags are computed after arrange(GEOID10, year)
# but WITHOUT grouping by GEOID10, so the first year(s) of each block group
# borrow values from the previous block group's final years -- confirm whether
# this mutate() should run per GEOID10.
suppressMessages(
permits_bg <- permits_bg %>%
              group_by(year) %>%
              mutate(nb = st_knn(geometry, 5),
                         wt = st_weights(nb),
                      lag_spatial = st_lag(permits_count, nb, wt)) %>% # calculate spat lag
              ungroup()%>%
              arrange(GEOID10, year) %>% # calculate time lag
               mutate(
                 lag_1_year = lag(permits_count, 1),
                 lag_2_years = lag(permits_count, 2),
                 lag_3_years = lag(permits_count, 3),
                 lag_4_years = lag(permits_count, 4),
                 lag_5_years = lag(permits_count, 5),
                 lag_6_years = lag(permits_count, 6),
                 lag_7_years = lag(permits_count, 7),
                 lag_8_years = lag(permits_count, 8),
                 lag_9_years = lag(permits_count, 9),
                 dist_to_2022 = 2022 - year
               )
)

### distance to transit---------------------------------
# Distance (in CRS units) from each block group's interior point to the
# nearest transit stop.
phl_bgs <- phl_bgs %>%
  rowwise() %>%
  mutate(
    dist_to_transit = as.numeric(min(st_distance(st_point_on_surface(geometry), transit_stops$geometry)))
  ) %>%
  ungroup()

### historic dists---------------------------------
# NOTE(review): this first hist_dists_x_bg is overwritten by the next
# assignment without ever being used -- dead code.
hist_dists_x_bg <- st_join(phl_bgs, historic_districts) %>%
              mutate(hist_dist = ifelse(is.na(hist_dist), 0, 1))

# One-hot encode historic-district membership per block group (one
# hist_dist_<name> indicator column per district; 0 where absent).
hist_dists_x_bg <- st_join(phl_bgs, historic_districts) %>%
                      st_drop_geometry() %>%
                      mutate(hist_dist_present = 1) %>%
                      group_by(GEOID10, hist_dist) %>%
                      summarize(hist_dist_present = max(hist_dist_present, na.rm = TRUE), .groups = 'drop') %>%
                      pivot_wider(names_from = "hist_dist", values_from = "hist_dist_present", 
                                  names_prefix = "hist_dist_", values_fill = list("hist_dist_present" = 0))

phl_bgs <- left_join(phl_bgs, hist_dists_x_bg, by = "GEOID10")

### hoods---------------------------------              
# Assign each block group to the neighborhood it overlaps most.
intersection <- st_intersection(phl_bgs, nbhoods)
intersection$intersection_area <- st_area(intersection)
max_intersection <- intersection %>%
  group_by(GEOID10) %>%
  filter(intersection_area == max(intersection_area)) %>%
  ungroup() %>%
  select(GEOID10, mapname) %>%
  st_drop_geometry()

bgs_w_hood <- left_join(phl_bgs, max_intersection, by = c("GEOID10" = "GEOID10")) %>%
                st_drop_geometry()

# NOTE(review): bgs_w_hood already contains every phl_bgs column, so this
# join duplicates them all with .x/.y suffixes; joining max_intersection
# directly (mapname only) was probably intended.
phl_bgs <- left_join(phl_bgs, bgs_w_hood, by = "GEOID10")

### overlays---------------------------------
# One-hot encode zoning-overlay presence per block group, mirroring the
# historic-district encoding above.
overlays_x_bg <- st_join(phl_bgs, overlays %>% select(overlay_symbol)) %>%
                      st_drop_geometry() %>%
                      mutate(overlay_present = 1) %>%
                      group_by(GEOID10, overlay_symbol) %>%
                      summarize(overlay_present = max(overlay_present, na.rm = TRUE), .groups = 'drop') %>%
                      pivot_wider(names_from = "overlay_symbol", values_from = "overlay_present", 
                                  names_prefix = "overlay_", values_fill = list("overlay_present" = 0))

phl_bgs <- left_join(phl_bgs, overlays_x_bg, by = "GEOID10")


### join back together----------------------
# Attach all block-group-level features to the permits panel.
permits_bg <- left_join(permits_bg,
                     st_drop_geometry(phl_bgs),
                     by = "GEOID10")

### acs vars--------------------------------------------
# Use 2014 ACS estimates for pre-2019 years and 2019 estimates from 2019 on.

permits_pre_2019 <- filter(permits_bg, year < 2019)
permits_post_2018 <- filter(permits_bg, year >= 2019)


permits_joined_pre_2019 <- left_join(permits_pre_2019, acs14_reg_vars, by = "GEOID10")
permits_joined_post_2018 <- left_join(permits_post_2018, acs19_reg_vars, by = "GEOID10")


permits_bg <- bind_rows(permits_joined_pre_2019, permits_joined_post_2018)

### clean-----------------------------------------------------
# Drop list-columns (neighbour/weight objects) and the id, then standardize
# column names to snake_case.
permits_bg <- permits_bg %>%
                select(-c(nb, wt, GEOID10)) %>%
                clean_names()
Show the code
# Year-by-year choropleth of permit counts; 2012 is excluded from the frames.
tm <- tm_shape(filter(permits_bg, year != 2012)) +
  tm_polygons(col = "permits_count", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_facets(along = "year") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

# Render one frame per year into an animated GIF.
suppressMessages(
  tmap_animation(tm, "assets/permits_animation.gif", delay = 50)
)

Philadelphia Building Permits, 2013 - 2022

3 Methods

3.1 Data

Show the code
# Citywide permit volume by year.
building_permits %>%
  ggplot(aes(x = as.factor(year))) +
  geom_bar() +
  labs(title = "Permits per Year")

Show the code
# Distribution of block-group permit counts, faceted by year; the x axis is
# log-scaled because the counts are heavily right-skewed.
permits_bg %>%
  st_drop_geometry() %>%
  ggplot(aes(x = permits_count)) +
  geom_histogram() +
  scale_x_log10() +
  facet_wrap(~year) +
  labs(title = "Permits per Block Group per Year",
       subtitle = "Log-Transformed")

Show the code
# 
# tm_shape(permits_bg) +
#   tm_polygons("spat_lag_permits", border.alpha = 0, palette = 'viridis', style = "fisher", colorNA = "lightgrey") +
#   tm_facets(along = "year") +
# tm_shape(broad_and_market) +
#   tm_lines(col = "lightgrey") +
#   tm_layout(frame = FALSE)

3.1.1 Feature Engineering

Show the code
# Long format: one row per (observation, engineered feature), so each lag
# feature's correlation with the outcome can be small-multiple plotted.
lag_long <- permits_bg %>%
  st_drop_geometry() %>%
  pivot_longer(
    cols = c(starts_with("lag"), dist_to_2022),
    names_to = "Variable",
    values_to = "Value"
  )

# Scatter of each feature against permit counts with a fitted line and
# Pearson correlation annotation.
ggscatter(
  lag_long,
  x = "permits_count", y = "Value", facet.by = "Variable",
  add = "reg.line",
  add.params = list(color = "blue", fill = "lightgray"),
  conf.int = TRUE
) +
  stat_cor(method = "pearson", p.accuracy = 0.001, r.accuracy = 0.01)

3.2 Models

4 Results

4.1 OLS Regression

To begin, we run a simple regression incorporating three engineered groups of features: space lag, time lag, and distance to 2022. We include this last variable because of a Philadelphia tax abatement policy that led to a significant increase in residential development in the years immediately before 2022. We will use this as a baseline model to compare to our more complex models.

Show the code
# Hold out 2022 as the test year; train on all earlier years. mapname is
# dropped because it is a high-cardinality label, not a predictor.
permits_train <- filter(permits_bg %>% select(-mapname), year < 2022)
permits_test <- filter(permits_bg %>% select(-mapname), year == 2022)

# Baseline OLS on every engineered feature.
reg <- lm(permits_count ~ ., data = st_drop_geometry(permits_train))

predictions <- predict(reg, permits_test)
predictions <- cbind(permits_test, predictions)

predictions <- predictions %>%
                  mutate(abs_error = abs(permits_count - predictions),
                         # FIX: epsilon guards against division by zero for
                         # block groups with no observed permits (Inf errors
                         # otherwise; matches the random-forest section).
                         pct_error = abs_error / (permits_count + 0.0001))

# Predicted vs. actual for the held-out year.
ggplot(predictions, aes(x = permits_count, y = predictions)) +
  geom_point() +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022") +
  geom_smooth(method = "lm", se = FALSE)

Show the code
# Summarize and map the OLS absolute error; `mae` is rendered inline below.
mae <- paste0("MAE: ", round(mean(predictions$abs_error, na.rm = TRUE), 2))

tm_shape(predictions) +
  tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

4.2 Poisson Model

Given that we are dealing with a skewed distribution of count data

Show the code
# Poisson GLM with log link: suited to a right-skewed count outcome.
poisson_reg <- glm(permits_count ~ ., 
                   family = poisson(link = "log"),
                   data = st_drop_geometry(permits_train))

# type = "response" returns expected counts rather than log-link values.
poisson_predictions <- predict(poisson_reg, permits_test, type = "response")
poisson_predictions <- cbind(permits_test, poisson_predictions)

poisson_predictions <- poisson_predictions %>%
                           mutate(abs_error = abs(permits_count - poisson_predictions),
                                  # FIX: epsilon guards against division by
                                  # zero for block groups with no observed
                                  # permits (matches the random-forest section).
                                  pct_error = abs_error / (permits_count + 0.0001))

# Predicted vs. actual for the held-out year.
ggplot(poisson_predictions, aes(x = permits_count, y = poisson_predictions)) +
  geom_point() +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022") +
  geom_smooth(method = "lm", se = FALSE)

Show the code
# Summarize and map the Poisson model's absolute error.
poisson_mae <- paste0("MAE: ", round(mean(poisson_predictions$abs_error, na.rm = TRUE), 2))

tm_shape(poisson_predictions) +
  tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

We find that our baseline model has an MAE of only 2.44—not bad for such a simple model! Still, it struggles most in the areas where we most need it to succeed, so we will try to introduce better variables and apply a more complex model to improve our predictions.

4.3 Random Forest Regression

Show the code
# Random forest on the same training frame; variable importance is recorded
# and rows containing NA (early years lack long lags) are dropped.
rf <- randomForest(
  permits_count ~ .,
  data = st_drop_geometry(permits_train),
  importance = TRUE,
  na.action = na.omit
)

rf_predictions <- predict(rf, permits_test)
rf_predictions <- cbind(permits_test, rf_predictions)
rf_predictions <- rf_predictions %>%
  mutate(
    abs_error = abs(permits_count - rf_predictions),
    # Epsilon avoids division by zero where no permits were observed.
    pct_error = abs_error / (permits_count + 0.0001)
  )

# Predicted vs. actual for the held-out year.
ggplot(rf_predictions, aes(x = permits_count, y = rf_predictions)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Predicted vs. Actual Permits",
       subtitle = "2022")

Show the code
# Summarize and map the random forest's absolute error.
rf_mae <- paste0("MAE: ", round(mean(rf_predictions$abs_error, na.rm = TRUE), 2))

tm_shape(rf_predictions) +
  tm_polygons(col = "abs_error", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

5 Discussion

5.1 Accuracy

Predominantly, our model overpredicts, which is better than underpredicting, as it facilitates new development.

Show the code
# Histogram of absolute errors; bin count via the square-root rule, with a
# vertical line at the mean absolute error.
nbins <- as.integer(sqrt(nrow(rf_predictions)))
vline <- mean(rf_predictions$abs_error, na.rm = TRUE)

rf_predictions %>%
  ggplot(aes(x = abs_error)) +
  geom_histogram(bins = nbins) +
  geom_vline(aes(xintercept = vline))

5.2 Generalizability

Show the code
# Context map: percent non-white residents per block group (2019 ACS).
tm_shape(acs19) +
  tm_polygons(col = "Percent_Nonwhite", border.alpha = 0, palette = 'viridis',
              style = "fisher", colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Show the code
# Label each block group by racial composition, then compare the error
# distributions across the two groups.
rf_predictions <- rf_predictions %>%
  mutate(
    race_comp = case_when(
      percent_nonwhite >= .50 ~ "Majority Non-White",
      TRUE ~ "Majority White"
    )
  )

rf_predictions %>%
  ggplot(aes(y = abs_error, color = race_comp)) +
  geom_boxplot(fill = NA)

How does this generalize across neighborhoods?

How does this generalize across council districts?

5.3 Assessing Upzoning Needs

We can identify conflict between projected development and current zoning.

Look at zoning that is industrial or residential single family in areas that our model suggests are high development risk for 2023:

Show the code
# Keep parcels zoned single-family residential (RS*) or industrial (I*).
filtered_zoning <- zoning %>%
  filter(str_detect(CODE, "RS") | str_detect(CODE, "I"))

# Map the retained parcels colored by zoning code.
tm_shape(filtered_zoning) +
  tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

We can extract development predictions at the block level to these parcels and then visualize them by highest need.

Show the code
# Attach the block-group-level forecast to each parcel.
# FIX: largest = TRUE keeps, for each parcel, only the block group with the
# greatest overlap; a plain st_join duplicates parcels that straddle a
# block-group boundary (e.g. OBJECTID 28949 appears twice in the table below
# without it).
filtered_zoning <- st_join(
  filtered_zoning,
  rf_predictions %>% select(rf_predictions),
  largest = TRUE
)

tm_shape(filtered_zoning) +
  tm_polygons(col = "rf_predictions", border.alpha = 0, colorNA = "lightgrey",
              palette = "viridis", style = "fisher") +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Show the code
# Switch to interactive mode for parcel-level inspection.
tmap_mode('view')

# Only parcels our model expects to face meaningful pressure (>10 permits).
high_pressure <- filtered_zoning %>%
  filter(rf_predictions > 10)

tm_shape(high_pressure) +
  tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey",
              popup.vars = c('rf_predictions', 'CODE')) +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

Furthermore, we can identify properties with high potential for assemblage, which suggests the ability to accommodate high-density, multi-unit housing.

Show the code
# ---- Assemblage potential: contiguous-parcel counts ---------------------
# Queen-contiguity neighbour list for each parcel.
nbs <- filtered_zoning %>% 
  mutate(nb = st_contiguity(geometry))

# Edge list of contiguous parcel pairs. st_contiguity() encodes "no
# neighbours" as the single id 0, which is dropped here.
edge_list <- tibble::tibble(id = seq_along(nbs$nb), nbs = nbs$nb) %>% 
  tidyr::unnest(nbs) %>% 
  filter(nbs != 0)

# One graph node per parcel, named by row index.
g <- make_empty_graph(n = nrow(filtered_zoning))
V(g)$name <- as.character(seq_len(nrow(filtered_zoning)))

# FIX: initialise `edges` unconditionally. The original defined it only
# inside the if-block, so an empty edge list raised
# "object 'edges' not found" at the %in% check below.
edges <- matrix(numeric(0), ncol = 2)
if (nrow(edge_list) > 0) {
  edges <- as.matrix(edge_list)
  g <- add_edges(g, c(t(edges)))
}

# Size of each parcel's order-1 neighbourhood (includes the parcel itself);
# isolated parcels count as 1. vapply pins the return type.
n_contiguous <- vapply(V(g)$name, function(node) {
  if (node %in% edges) {
    length(neighborhood(g, order = 1, nodes = as.numeric(node))[[1]])
  } else {
    1L  # Nodes without neighbors count as 1 (themselves)
  }
}, integer(1))

filtered_zoning <- filtered_zoning %>%
  mutate(n_contig = n_contiguous)

# Table of poorly-zoned parcels with high predicted pressure and at least
# two contiguous neighbours (assemblage candidates).
filtered_zoning %>%
  st_drop_geometry() %>%
  select(rf_predictions,
         n_contig,
         OBJECTID,
         CODE) %>%
  filter(rf_predictions > 10,
         n_contig > 2) %>%
  kablerize(caption = "Poorly-Zoned Properties with High Development Risk")
Poorly-Zoned Properties with High Development Risk
rf_predictions n_contig OBJECTID CODE
275 12.16710 3 604 I2
858 21.58597 6 1524 ICMX
903 12.94683 6 1595 RSA5
916 21.58597 8 1615 ICMX
1522 11.24437 4 2559 RSA5
1631 21.58597 3 2736 IRMX
1643 21.58597 6 2756 ICMX
1670 21.58597 7 2803 ICMX
1671 21.58597 3 2804 IRMX
1865 34.38930 3 3128 IRMX
2099 12.94683 6 3492 RSA5
2251 11.42080 4 3744 IRMX
2685 12.94683 6 4533 RSA5
3168 10.58817 3 5506 RSA5
3439 21.58597 6 6067 ICMX
3554 12.94683 6 6289 RSA5
3616 21.58597 3 6405 RSA5
3836 12.94683 7 6869 ICMX
3846 34.38930 3 6901 ICMX
3865 12.94683 6 6943 RSA5
4155 29.69270 3 7646 ICMX
4178 11.21383 3 7704 IRMX
4593 12.94683 6 8805 RSA5
4697 23.03563 3 9094 RSA5
4752 16.08273 5 9244 IRMX
4800 10.58817 3 9371 ICMX
4910 21.58597 3 9662 RSA5
5211 35.68783 3 10411 ICMX
5212 35.68783 3 10412 RSA5
5213 35.68783 3 10413 ICMX
5345 18.21887 3 10760 IRMX
5501 39.00537 3 11150 ICMX
5506 35.68783 3 11161 RSA5
5616 11.24437 3 11449 RSA3
5996 11.00043 4 12339 I2
6206 11.00043 3 12808 RSA5
6258 11.00043 3 12932 ICMX
6312 16.08273 6 13058 ICMX
6412 23.99983 3 13314 IRMX
6703 22.58767 3 13979 I3
6705 11.00043 3 13981 RSA3
6720 13.26530 3 14019 RSA5
6793.2 11.00043 8 14223 I2
6794 22.58767 3 14224 ICMX
6808 22.58767 3 14257 I2
6855 11.00043 3 14373 RSA5
6864 11.00043 3 14401 RSA5
6865 11.00043 3 14402 RSA5
6965 13.26530 3 14649 ICMX
7014 12.94683 8 14748 ICMX
7192 11.24437 3 15167 RSA2
7446 39.00537 3 15720 I2
7487 11.00043 6 15820 RSA5
7637 13.26530 3 16181 RSA5
7885 39.00537 3 16719 RSA5
8105 15.75340 3 17170 ICMX
8132 13.26530 3 17217 ICMX
8220 15.48503 3 17410 RSA5
8543 13.56143 3 18033 RSD3
9075 13.56143 3 19078 RSA3
9362 11.24437 3 19593 RSA3
9607 21.58597 4 20075 ICMX
9802 12.16710 3 20444 RSA5
9854 13.56143 4 20536 RSA2
10377 13.26530 3 21529 ICMX
10651 13.56143 3 22004 RSD1
12877 29.69270 4 25778 RSA5
12929 11.24437 4 25868 RSA1
13164 13.56143 3 26249 RSD1
13494 10.85073 3 26759 RSA5
13599 11.24437 3 26935 RSA3
13817 11.24437 3 27284 RSA2
13892 11.24437 3 27394 RSD3
14110 13.26530 4 27776 I2
14158 16.06680 3 27871 IRMX
14719.2 21.58597 28 28949 I2
14719.3 12.94683 28 28949 I2
14720 12.94683 6 28950 RSA5
Show the code
tmap_mode('view')

filtered_zoning %>%
  select(rf_predictions,
         n_contig,
         OBJECTID,
         CODE) %>%
  filter(rf_predictions > 10,
         n_contig > 2) %>%
tm_shape() +
        tm_polygons(col = "rf_predictions", border.alpha = 0, colorNA = "lightgrey", palette = "viridis", style = "fisher",
                    popup.vars = c('rf_predictions', 'n_contig', 'CODE'), alpha = 0.5) +
  tm_shape(broad_and_market) +
  tm_lines(col = "lightgrey") +
  tm_layout(frame = FALSE)

6 Conclusion

7 Appendices